###################################################################################################################
# Asmaa Ali Abdelwahab Ali #
# ID:1910069 #
# CIT-651: ML_Course_Project #
# Prof. Moustafa ElAttar #
###################################################################################################################
This dataset is composed of a range of biomedical voice measurements from 31 people, 23 of whom have Parkinson's disease (PD). Each column in the table is a particular voice measure, and each row corresponds to one of 195 voice recordings from these individuals; there are around six recordings per patient.
### Attribute Information:
name - ASCII subject name and recording number
status - health status of the subject (1 = Parkinson's disease, 0 = healthy); this is the prediction target
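As a quick check on the statement that there are around six recordings per patient, the recordings can be grouped by subject. This is a minimal sketch, not part of the original analysis; it assumes the name field follows the UCI pattern phon_R01_Sxx_y (subject ID is the third underscore-separated token) and uses the same relative file path as the notebook below.
import pandas as pd
recordings = pd.read_csv('single term/parkinsons.data')
# Assumption: subject ID is the third underscore-separated token of 'name', e.g. 'S01' in 'phon_R01_S01_1'
subject_id = recordings['name'].str.split('_').str[2]
# Recordings per subject (expected to be roughly six each)
print(subject_id.value_counts().sort_index())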
# Load libraries
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from xgboost import XGBClassifier
import sklearn.metrics as metrics
import sklearn.model_selection as ms
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, power_transform
from sklearn.ensemble import VotingClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, Perceptron, ElasticNet, Lasso
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import VarianceThreshold
import warnings
warnings.filterwarnings('ignore')
# Load dataset
df1 = pd.read_csv('single term/parkinsons.data', sep = ',', header = 0)
df1.head()
# types
df1.dtypes
# Data shape
df1.shape
# Number of duplicated rows
df1.duplicated(keep='first').sum()
# The row and column indices where the value is NaN
np.where(pd.isnull(df1))
# Basic Descriptive statistics
pd.set_option('display.precision', 2)
df1.describe()
# class distribution
print(df1.groupby('status').size())
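The counts above show the classes are imbalanced, so a useful reference point is the majority-class baseline, i.e. the accuracy obtained by always predicting the more frequent class. A minimal sketch (not in the original notebook), assuming df1 is the frame loaded above:
# Majority-class baseline: accuracy of always predicting the most frequent class
baseline = df1['status'].value_counts(normalize=True).max()
print('Majority-class baseline accuracy: %.3f' % baseline)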
# Correlation
pd.set_option('display.precision', 2)
corrMat = df1.drop(columns=['name']).corr(method='pearson')
corrMat[corrMat > 0.7].count()+corrMat[corrMat < -0.7].count()
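To see which feature pairs drive that count, one option (not in the original notebook) is to list the pairs whose absolute Pearson correlation exceeds 0.7, using the upper triangle of corrMat (and np from the imports above) so each pair is reported once:
# Keep only the upper triangle so each feature pair appears once, then filter on |r| > 0.7
upper = corrMat.where(np.triu(np.ones(corrMat.shape, dtype=bool), k=1))
pairs = upper.stack()
print(pairs[pairs.abs() > 0.7].sort_values(ascending=False))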
# histograms
sns.set()
df1.hist(sharex=False, sharey=False, xlabelsize=1, layout=(6,4), ylabelsize=1, figsize=(18,18))
plt.show()
# density
df1[df1.columns.difference(['status', 'name'])].plot(kind='density', subplots=True, layout=(6,4), sharex=False, legend=True, fontsize=1, figsize=(20,20))
plt.show()
# Paired plot using seaborn
sns.pairplot(df1[df1.columns.difference(['name'])], hue="status", diag_kind="kde")
# box and whisker plots
df1[df1.columns.difference(['status', 'name'])].plot(kind='box', subplots=True, layout=(6,4), sharex=False, sharey=False,fontsize=8, figsize=(20,20))
plt.show()
# Correlation heatmap
corrMat1 = df1.drop(columns=['name']).corr()
fig1, ax1 = plt.subplots(figsize=(20,20))
sns.heatmap(corrMat1, vmax=0.8, square = True, annot=True, ax=ax1)
def preprocess_data(file_name):
    seed = 10
    pd.set_option('display.max_colwidth', None)
    # Read the CSV file
    df = pd.read_csv(file_name)
    # Optional yeo-johnson transformation of the numeric features
    #df[df.columns.difference(['status', 'name'])] = power_transform(df[df.columns.difference(['status', 'name'])], method='yeo-johnson')
    # Prepare input and output data
    X = df.loc[:, df.columns.difference(['status', 'name'])]
    y = df.loc[:, 'status']
    print("\nThe number of features before removing correlated features is: ", len(X.columns))
    # Remove highly correlated features (|r| > 0.95), keeping one column from each correlated group
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
    X = X.drop(to_drop, axis=1)
    print("\nThe number of features after removing correlated features is: ", len(X.columns), '\n')
    for i in X.columns:
        print('\t', i)
    # Split data into train and test sets
    x_train, x_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.2, random_state=0)
    return (X, y, x_train, x_test, y_train, y_test)
X, y, x_train, x_test, y_train, y_test = preprocess_data('single term/parkinsons.data')
from xgboost import XGBClassifier
from xgboost import plot_importance
# split data into X and y
X = df1.loc[:,df1.columns.difference(['status', 'name'])]
y = df1.loc[:,'status']
# fit an XGBoost model on the full dataset (no train/test split here)
model = XGBClassifier()
model.fit(X, y)
# plot feature importance
plot_importance(model)
plt.savefig('feature_importance.png')
plt.show()
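Besides the plot, the fitted model's importance scores can be printed as a ranked list, which is easier to scan when there are many features. A small sketch using the model and X defined above; note that feature_importances_ and plot_importance may use different importance types depending on the xgboost version.
# Rank features by the fitted booster's importance scores
importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
print(importances.head(10))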
# Test options and evaluation metric
num_folds = 10
seed = 7
scoring = 'accuracy'
# Check Algorithms
models = []
#Linear Discriminative Models
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('PRC', Perceptron()))
#Linear Generative Models
models.append(('LR',LogisticRegression()))
models.append(('NB', GaussianNB()))
#Nonlinear models
models.append(('KNN', KNeighborsClassifier()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('SVM', SVC(probability=True)))
models.append(('RF', RandomForestClassifier()))
models.append(('GB', GradientBoostingClassifier()))
models.append(('MLP', MLPClassifier()))
# evaluate each model in turn
results = []
names = []
acc = []
for name, model in models:
    kfold = ms.KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = ms.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    acc.append(cv_results.mean())
# Check Algorithms
pipelines = []
#Linear Discriminative Models
pipelines.append(('ScaledLDA', Pipeline([('Scaler', StandardScaler()), ('LDA', LinearDiscriminantAnalysis())])))
pipelines.append(('ScaledPRC', Pipeline([('Scaler', StandardScaler()), ('PRC', Perceptron())])))
#Linear Generative Models
pipelines.append(('ScaledLR',Pipeline([('Scaler', StandardScaler()),('LR',LogisticRegression())])))
pipelines.append(('ScaledNB', Pipeline([('Scaler', StandardScaler()), ('NB', GaussianNB())])))
#Nonlinear models
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsClassifier())])))
pipelines.append(('ScaledDT', Pipeline([('Scaler', StandardScaler()), ('DT', DecisionTreeClassifier())])))
pipelines.append(('ScaledSVM', Pipeline([('Scaler', StandardScaler()), ('SVC', SVC(probability=True))])))
pipelines.append(('ScaledRF', Pipeline([('Scaler', StandardScaler()), ('RF', RandomForestClassifier())])))
pipelines.append(('ScaledGB', Pipeline([('Scaler', StandardScaler()), ('GB', GradientBoostingClassifier())])))
pipelines.append(('ScaledMLP', Pipeline([('Scaler', StandardScaler()), ('MLP', MLPClassifier())])))
# evaluate each model in turn
results_scaled = []
names_scaled = []
acc_scaled = []
for name, model in pipelines:
    kfold = ms.KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = ms.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results_scaled.append(cv_results)
    names_scaled.append(name)
    acc_scaled.append(cv_results.mean())
#Print accuracies before and after scaling
acc = pd.DataFrame(list(zip(names,acc,acc_scaled)), columns=['Algorithm','Accuracy before scaling','Accuracy after scaling'])
acc
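An optional extra step (not in the original notebook): sort the comparison table by the scaled cross-validation accuracy so the ranking is explicit.
# Rank algorithms by cross-validation accuracy after scaling
acc.sort_values('Accuracy after scaling', ascending=False)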
# Compare Algorithms
plt.figure(figsize=(15,5))
plt.subplot(1, 2, 1)
plt.boxplot(results)
plt.title('Algorithm Comparison before scaling', size='large')
plt.xticks(range(len(names)+1), ['']+names, size='small')
plt.yticks(np.arange(0,1.2,0.1))
plt.subplot(1, 2, 2)
plt.boxplot(results_scaled)
plt.title('Algorithm Comparison after scaling')
plt.xticks(range(len(names)+1), ['']+names, size='small')
plt.yticks(np.arange(0,1.2,0.1))
plt.tight_layout()
plt.show()
# KNN Algorithm tuning
scaler = StandardScaler().fit(x_train)
rescaledX = scaler.transform(x_train)
k_values = np.array([1,3,5,7,9,11,13,15,17,19,21,23,25])
param_grid = dict(n_neighbors=k_values)
model = KNeighborsClassifier()
kfold = ms.KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = ms.GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(rescaledX, y_train)
print("\nBest: %f using %s\n" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean, param in zip(means, params):
    print("%f with: %r" % (mean, param))
# GBM Algorithm tuning
param_grid = {'learning_rate':[0.15,0.1,0.05,0.01,0.005,0.001], 'n_estimators':[50,100,150,200,250,300,400]}
scaler = StandardScaler().fit(x_train)
rescaledX = scaler.transform(x_train)
model = GradientBoostingClassifier()
kfold = ms.KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = ms.GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(rescaledX, y_train)
print("\nBest: %f using %s\n" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean, param in zip(means, params):
    print("%f with: %r" % (mean, param))
# ensembles
ensembles = []
ensembles.append(('ScaledAB', Pipeline([('Scaler', StandardScaler()),('AB', AdaBoostClassifier())])))
ensembles.append(('ScaledGBM', Pipeline([('Scaler', StandardScaler()),('GBM', GradientBoostingClassifier())])))
ensembles.append(('ScaledRF', Pipeline([('Scaler', StandardScaler()),('RF', RandomForestClassifier())])))
ensembles.append(('ScaledET', Pipeline([('Scaler', StandardScaler()),('ET', ExtraTreesClassifier())])))
results = []
names = []
for name, model in ensembles:
    kfold = ms.KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = ms.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f" % (name, cv_results.mean())
    print(msg)
# Compare Algorithms
fig = plt.figure()
fig.suptitle('Scaled Ensemble Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# Tune scaled ET
scaler = StandardScaler().fit(x_train)
rescaledX = scaler.transform(x_train)
param_grid = dict(n_estimators=np.array([50,100,150,200,250,300,350,400]))
model = ExtraTreesClassifier(random_state=seed)
kfold = ms.KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = ms.GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(rescaledX, y_train)
print("\nBest: %f using %s\n" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, std, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, std, param))
#Get the features and labels
features=df1.loc[:,df1.columns!='status'].values[:,1:]
labels=df1.loc[:,'status'].values
#Get the count of each label (0 and 1) in labels
print(labels[labels==1].shape[0], labels[labels==0].shape[0])
#Scale the features to between -1 and 1
scaler=MinMaxScaler((-1,1))
x=scaler.fit_transform(features)
y=labels
#Split the dataset
x_train,x_test,y_train,y_test=ms.train_test_split(x, y, test_size=0.2, random_state=7)
#Train an XGBoost (eXtreme Gradient Boosting) model
model=XGBClassifier()
model.fit(x_train,y_train)
#Calculate the accuracy
y_pred=model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred)*100)
# Tune scaled XGBClassifier
scaler = StandardScaler().fit(x_train)
rescaledX = scaler.transform(x_train)
param_grid = dict(n_estimators=np.array([150,200,250,300,350]),
                  learning_rate=np.array([0.001,0.01,0.05,0.1,1,10]),
                  gamma=np.array([0.001,0.01,0.05,0.1,0,1]))
model = XGBClassifier(random_state=seed)
kfold = ms.KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = ms.GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(rescaledX, y_train)
print("\nBest: %f using %s\n" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, std, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, std, param))
# prepare the model
scaler = StandardScaler().fit(x_train)
rescaledX = scaler.transform(x_train)
model = KNeighborsClassifier(n_neighbors=1)
model.fit(rescaledX, y_train)
# transform the test set with the scaler fitted on the training set
rescaledXtest = scaler.transform(x_test)
y_pred = model.predict(rescaledXtest)
def cm_analysis(y_true, y_pred, filename, labels, ymap=None, figsize=(10,10)):
    # Plot a confusion matrix annotated with per-class percentages and counts, and save it to file
    if ymap is not None:
        y_pred = [ymap[yi] for yi in y_pred]
        y_true = [ymap[yi] for yi in y_true]
        labels = [ymap[yi] for yi in labels]
    cm = metrics.confusion_matrix(y_true, y_pred, labels=labels)
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100
    annot = np.empty_like(cm).astype(str)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                s = int(cm_sum[i, 0])
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
            elif c == 0:
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)
    cm = pd.DataFrame(cm, index=labels, columns=labels)
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(cm, annot=annot, fmt='', ax=ax)
    plt.savefig(filename)
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred), '\n')
print(metrics.classification_report(y_test, y_pred))
false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(y_test, y_pred)
roc_auc = metrics.auc(false_positive_rate, true_positive_rate)
plt.figure(figsize=(8,6))
plt.title('Receiver Operating Characteristic for KNN')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
print('\n')
cm_analysis(y_test, y_pred, 'Confusion.PNG', [0, 1], ymap=None, figsize=(10,8))